package com.esri.json.hadoop;

import java.io.IOException;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;


/**
 * 
 * Enumerates records from an Esri Unenclosed JSON file
 * 
 */
/*
 * The JSON will look like this (white-space ignored)
 * 
 * { // start record 1
 * 	"attributes" : {},
 * 	"geometry" : {}
 * } // end record 1
 * { // start record 2
 * 	"attributes" : {},
 * 	"geometry" : {}
 * } // end record 2
 */
public class UnenclosedEsriJsonRecordReader extends UnenclosedBaseJsonRecordReader {
	static final Log LOG = LogFactory.getLog(UnenclosedEsriJsonRecordReader.class.getName());

	/**
	 * Read-ahead limit passed to {@code inputReader.mark()} each time a candidate
	 * opening brace is found.
	 */
	private static final int MARK_READ_AHEAD_LIMIT = 100;

	/**
	 * Creates a reader by delegating to the superclass no-argument constructor.
	 * Declared explicitly only so that the exception can be declared.
	 *
	 * @throws IOException propagated from the superclass constructor
	 */
	public UnenclosedEsriJsonRecordReader() throws IOException {
		super();
	}

	/**
	 * Creates a reader for the given split by delegating to the superclass.
	 *
	 * @param split the input split handed to the superclass constructor
	 * @param conf  the Hadoop configuration handed to the superclass constructor
	 * @throws IOException propagated from the superclass constructor
	 */
	public UnenclosedEsriJsonRecordReader(org.apache.hadoop.mapred.InputSplit split,
										  Configuration conf) throws IOException {
		super(split, conf);
	}

	/**
	 * Given an arbitrary byte offset into an unenclosed JSON document,
	 * find the start of the next record in the document.  Discard trailing
	 * bytes from the previous record if we happened to seek to the middle
	 * of it.
	 *
	 * Record boundary defined as : \{\s*"(attributes|geometry)"\s*:\s*\{
	 *
	 * On success the input reader is reset to the mark taken immediately after
	 * the record's opening brace, {@code readerPosition} is restored to the
	 * position recorded at that mark, and {@code firstBraceConsumed} is set to
	 * {@code true}.
	 *
	 * @return {@code true} if a record start was found; {@code false} if the end
	 *         of the stream was reached first
	 * @throws IOException if reading from, marking, or resetting the input reader fails
	 */
	protected boolean moveToRecordStart() throws IOException {
		int next = 0;
		long resetPosition = readerPosition;

		// The case of split point exactly at whitespace between records, is
		// handled by forcing it to the split following, in the interest of
		// better balancing the splits, by consuming the whitespace in next().
		// The alternative of forcing it to the split preceding, could be
		// done like what is commented here.
		//   while (next != '{' || skipDup > 0) {  // skipDup>0 => record already consumed
		// 	  next = getChar();
		// 	  if (next < 0)  return false;   // end of stream, no good
		// 	  if (next == '}')  skipDup = -1;  // Definitely not
		// 	  else if (skipDup == 0) skipDup = 1;  // no info - Maybe so until refuted by '}'
		//   }

		while (true) {

			// Scan forward to the next '{'.  If the previous attempt already
			// stopped on a '{', that brace is reused as the new candidate.
			while (next != '{') {
				next = getChar();
				if (next < 0) {
					return false;  // end of stream, no good
				}
			}

			// Remember where the candidate record begins so we can rewind to it.
			// NOTE(review): reset() below relies on this mark still being valid; if
			// more than MARK_READ_AHEAD_LIMIT characters (e.g. heavy whitespace) are
			// read before the boundary is confirmed, reset() may fail depending on
			// the reader implementation - confirm against the base class.
			resetPosition = readerPosition;
			inputReader.mark(MARK_READ_AHEAD_LIMIT);

			// Last char was '{'; the next significant char must open a field name.
			next = getNonWhite();
			if (next < 0) {
				return false;  // end of stream, no good
			}
			if (next != '"') {
				continue;
			}

			// Collect the field name, which should be  attributes  or  geometry .
			//
			// If we see another opening brace, the previous one must have been inside
			// a quoted string literal (after which the double quote we found was a
			// closing quote mark rather than the opening quote mark) - start over.
			boolean escaped = false;  // true when the previous char was an escaping backslash
			StringBuilder fieldName = new StringBuilder();
			while (next != '{') {
				next = getChar();
				if (next < 0) {
					return false;  // end of stream, no good
				}

				if (escaped) {
					// This char is escaped by the preceding backslash, so a quote
					// here does not terminate the field name.
					escaped = false;
				} else if (next == '\\') {
					escaped = true;
				} else if (next == '"') {
					break;
				}

				fieldName.append((char) next);
			}

			String name = fieldName.toString();
			if (!(name.equals("attributes") || name.equals("geometry"))) {
				// not the field name we were expecting, start over
				continue;
			}

			// Last char was the closing '"'; a ':' must follow.
			next = getNonWhite();
			if (next < 0) {
				return false;  // end of stream, no good
			}
			if (next != ':') {
				continue;
			}

			// Finally, the field value must open with '{' for this to be a record.
			next = getNonWhite();
			if (next < 0) {
				return false;  // end of stream, no good
			}

			if (next == '{') {
				// at this point we can be sure that we have found the record boundary
				break;
			}
		}

		// Rewind to just after the record's opening brace.
		inputReader.reset();
		readerPosition = resetPosition;

		firstBraceConsumed = true;

		return true;
	}

}
